## Setup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter('ignore', DeprecationWarning)
%matplotlib inline
import seaborn as sns
import plotly.express as px
import scipy.stats as stats
# Import Cluster Analysis libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn import metrics
from sklearn.neighbors import NearestNeighbors
import scipy.cluster.hierarchy as sch
#import visualization libraries
from matplotlib.pyplot import scatter
import plotly
from plotly.graph_objs import Scatter, Marker, Layout, layout,XAxis, YAxis, Bar, Line
%matplotlib inline
# get ANOVA table as R like output
import statsmodels.api as sm
from statsmodels.formula.api import ols
# Load the airline satisfaction dataset from the course GitHub mirror.
df = pd.read_csv('https://raw.githubusercontent.com/dk28yip/MSDS7331/main/airline.csv')
df.head()
# Reduce the sample set from ~100,000 to 30,000 rows as a few of us had
# computer performance issues. Fix the seed so the subsample — and every
# statistic, ANOVA, and cluster computed from it below — is reproducible.
df = df.sample(n=30000, random_state=42)
# Any missing values in the dataset
def plot_missingness(df: pd.DataFrame = df) -> None:
    """Bar-chart each column's missing vs. complete row percentages.

    Builds one 'Missingness' and one 'Completeness' row per column and
    renders them as a color-coded plotly bar chart.

    NOTE: the default argument binds the module-level ``df`` at definition
    time (kept for backward compatibility with no-arg calls); pass a frame
    explicitly for anything else.
    """
    n_rows = df.shape[0]
    nan_df = pd.DataFrame(df.isna().sum()).reset_index()
    nan_df.columns = ['Column', 'NaN_Count']
    nan_df['NaN_Count'] = nan_df['NaN_Count'].astype('int')
    nan_df['NaN_%'] = round(nan_df['NaN_Count'] / n_rows * 100, 4)
    nan_df['Type'] = 'Missingness'
    nan_df.sort_values('NaN_%', inplace=True)
    # Completeness rows mirror the missingness rows. Built vectorized in
    # one shot instead of the original row-by-row pd.concat loop, which was
    # O(n^2) and only worked because the duplicate index labels it created
    # never happened to collide with the loop counter.
    complete_df = pd.DataFrame({
        'Column': nan_df['Column'],
        'NaN_Count': (n_rows - nan_df['NaN_Count']).astype('int'),
        # The original cast completeness percentages to int (truncation);
        # kept so the rendered chart is unchanged.
        'NaN_%': (100 - nan_df['NaN_%']).astype('int'),
        'Type': 'Completeness',
    })
    nan_df = pd.concat([nan_df, complete_df], sort=True)
    nan_df = nan_df.rename(columns={"Column": "Feature", "NaN_%": "Missing %"})
    # Missingness Plot
    fig = px.bar(nan_df,
                 x='Feature',
                 y='Missing %',
                 title=f"Missingness Plot (N={n_rows})",
                 color='Type',
                 opacity=0.6,
                 color_discrete_sequence=['red', '#808080'],
                 width=800,
                 height=800)
    fig.show()
# Render the missingness chart, then summarize the single affected column.
plot_missingness(df)
# Fixed typos in the report string: "if" -> "in", missing space before "column".
print("Missing 99 values in the 'Arrival Delay in Minutes' column; approximately 0.31%.")
Missing 99 values in the 'Arrival Delay in Minutes' column; approximately 0.31%.
ID was removed from the dataset as it was used as a unique identifier for each passenger
# Encode the binary categoricals as 0/1 indicator columns.
df["GenderNumeric"] = (df["Gender"]=="Male").astype(int)
df["CustomerTypeNumeric"] = (df["Customer Type"]=="Loyal Customer").astype(int)
df["TypeofTravelNumeric"] = (df["Type of Travel"]=="Personal Travel").astype(int)
# Ordinal-encode cabin class (Eco=0 < Eco Plus=1 < Business=2). Assign the
# replace() result back instead of the original chained
# `df[col].replace(..., inplace=True)`, which mutates through a column view
# (SettingWithCopy hazard, deprecated in recent pandas).
df["ClassNumeric"] = df["Class"].replace(['Eco', 'Eco Plus', 'Business'], [0, 1, 2])
# Change Satisfaction to 1 or 0 (same fix for the chained inplace replace).
df["SatisfactionNumeric"] = df["satisfaction"].replace(['neutral or dissatisfied', 'satisfied'], [0, 1])
# Impute the 99 missing arrival delays with 0 minutes.
df["Arrival Delay in Minutes"] = df["Arrival Delay in Minutes"].fillna(0)
# 'id' is a unique passenger identifier and carries no analytical signal.
dfclean = df.drop(columns=['id'])
# Double check on the missing values — after the fillna above, every column
# should report 0 (the original "=310" note predates the zero-fill).
dfclean.isnull().sum()
Gender 0 Customer Type 0 Age 0 Type of Travel 0 Class 0 Flight Distance 0 Inflight wifi service 0 Departure/Arrival time convenient 0 Ease of Online booking 0 Gate location 0 Food and drink 0 Online boarding 0 Seat comfort 0 Inflight entertainment 0 On-board service 0 Leg room service 0 Baggage handling 0 Checkin service 0 Inflight service 0 Cleanliness 0 Departure Delay in Minutes 0 Arrival Delay in Minutes 0 satisfaction 0 GenderNumeric 0 CustomerTypeNumeric 0 TypeofTravelNumeric 0 ClassNumeric 0 SatisfactionNumeric 0 dtype: int64
# Fill in missing values. NOTE: arrival-delay NaNs were already filled with
# 0 earlier, so this median fill is a defensive no-op safety net. Assign
# back instead of the chained `inplace=True` on a column view, which pandas
# deprecates.
dfclean["Arrival Delay in Minutes"] = dfclean["Arrival Delay in Minutes"].fillna(
    dfclean["Arrival Delay in Minutes"].median())
dfclean.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| Age | 30000.0 | 39.299833 | 15.138575 | 7.0 | 27.0 | 40.0 | 51.0 | 85.0 |
| Flight Distance | 30000.0 | 1183.915033 | 993.241471 | 31.0 | 413.0 | 837.0 | 1727.0 | 4983.0 |
| Inflight wifi service | 30000.0 | 2.736200 | 1.324060 | 0.0 | 2.0 | 3.0 | 4.0 | 5.0 |
| Departure/Arrival time convenient | 30000.0 | 3.055433 | 1.526577 | 0.0 | 2.0 | 3.0 | 4.0 | 5.0 |
| Ease of Online booking | 30000.0 | 2.762267 | 1.399172 | 0.0 | 2.0 | 3.0 | 4.0 | 5.0 |
| Gate location | 30000.0 | 2.971367 | 1.279271 | 0.0 | 2.0 | 3.0 | 4.0 | 5.0 |
| Food and drink | 30000.0 | 3.203200 | 1.330903 | 0.0 | 2.0 | 3.0 | 4.0 | 5.0 |
| Online boarding | 30000.0 | 3.259167 | 1.346771 | 0.0 | 2.0 | 3.0 | 4.0 | 5.0 |
| Seat comfort | 30000.0 | 3.441400 | 1.321750 | 1.0 | 2.0 | 4.0 | 5.0 | 5.0 |
| Inflight entertainment | 30000.0 | 3.359267 | 1.335160 | 0.0 | 2.0 | 4.0 | 4.0 | 5.0 |
| On-board service | 30000.0 | 3.390300 | 1.286139 | 1.0 | 2.0 | 4.0 | 4.0 | 5.0 |
| Leg room service | 30000.0 | 3.353633 | 1.311907 | 0.0 | 2.0 | 4.0 | 4.0 | 5.0 |
| Baggage handling | 30000.0 | 3.634367 | 1.184480 | 1.0 | 3.0 | 4.0 | 5.0 | 5.0 |
| Checkin service | 30000.0 | 3.310167 | 1.261566 | 1.0 | 3.0 | 3.0 | 4.0 | 5.0 |
| Inflight service | 30000.0 | 3.650567 | 1.175092 | 1.0 | 3.0 | 4.0 | 5.0 | 5.0 |
| Cleanliness | 30000.0 | 3.289867 | 1.313964 | 0.0 | 2.0 | 3.0 | 4.0 | 5.0 |
| Departure Delay in Minutes | 30000.0 | 14.587367 | 37.485305 | 0.0 | 0.0 | 0.0 | 13.0 | 1592.0 |
| Arrival Delay in Minutes | 30000.0 | 14.876867 | 37.744196 | 0.0 | 0.0 | 0.0 | 13.0 | 1584.0 |
| GenderNumeric | 30000.0 | 0.491900 | 0.499943 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 |
| CustomerTypeNumeric | 30000.0 | 0.818067 | 0.385796 | 0.0 | 1.0 | 1.0 | 1.0 | 1.0 |
| TypeofTravelNumeric | 30000.0 | 0.309533 | 0.462309 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 |
| ClassNumeric | 30000.0 | 1.029267 | 0.961513 | 0.0 | 0.0 | 1.0 | 2.0 | 2.0 |
| SatisfactionNumeric | 30000.0 | 0.434633 | 0.495717 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 |
# Pairwise Pearson correlations of the numeric columns.
# NOTE(review): this relies on pandas silently dropping the non-numeric
# (object) columns; pandas >= 2.0 requires corr(numeric_only=True) on a
# mixed-dtype frame — confirm the pandas version in use.
dfclean.corr()
| Age | Flight Distance | Inflight wifi service | Departure/Arrival time convenient | Ease of Online booking | Gate location | Food and drink | Online boarding | Seat comfort | Inflight entertainment | ... | Checkin service | Inflight service | Cleanliness | Departure Delay in Minutes | Arrival Delay in Minutes | GenderNumeric | CustomerTypeNumeric | TypeofTravelNumeric | ClassNumeric | SatisfactionNumeric | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Age | 1.000000 | 0.100593 | 0.003958 | 0.030369 | 0.013305 | -0.009106 | 0.022379 | 0.199120 | 0.165753 | 0.082095 | ... | 0.030686 | -0.046155 | 0.055864 | -0.010463 | -0.013109 | 0.000808 | 0.285067 | -0.048445 | 0.139015 | 0.131959 |
| Flight Distance | 0.100593 | 1.000000 | -0.000385 | -0.021131 | 0.062276 | -0.001263 | 0.055368 | 0.209287 | 0.162458 | 0.127695 | ... | 0.069235 | 0.059287 | 0.092476 | -0.001503 | -0.005201 | -0.000801 | 0.222738 | -0.262669 | 0.445825 | 0.294396 |
| Inflight wifi service | 0.003958 | -0.000385 | 1.000000 | 0.348631 | 0.713397 | 0.339447 | 0.129750 | 0.453095 | 0.120003 | 0.209892 | ... | 0.039147 | 0.122411 | 0.126899 | -0.019791 | -0.020471 | 0.005182 | 0.001839 | -0.103161 | 0.036542 | 0.282816 |
| Departure/Arrival time convenient | 0.030369 | -0.021131 | 0.348631 | 1.000000 | 0.441277 | 0.451814 | -0.001164 | 0.076496 | 0.012538 | -0.008201 | ... | 0.091393 | 0.079052 | 0.012031 | 0.003782 | 0.001855 | 0.005371 | 0.204754 | 0.261917 | -0.095647 | -0.057916 |
| Ease of Online booking | 0.013305 | 0.062276 | 0.713397 | 0.441277 | 1.000000 | 0.461168 | 0.025925 | 0.401942 | 0.027057 | 0.040868 | ... | 0.007783 | 0.041945 | 0.011411 | -0.006769 | -0.008291 | -0.004421 | 0.014664 | -0.130555 | 0.100865 | 0.166857 |
| Gate location | -0.009106 | -0.001263 | 0.339447 | 0.451814 | 0.461168 | 1.000000 | -0.007723 | 0.002373 | 0.002113 | 0.001807 | ... | -0.037831 | 0.010574 | -0.006406 | 0.010107 | 0.008999 | -0.008520 | -0.005085 | -0.024355 | -0.004630 | -0.003030 |
| Food and drink | 0.022379 | 0.055368 | 0.129750 | -0.001164 | 0.025925 | -0.007723 | 1.000000 | 0.227616 | 0.571047 | 0.620251 | ... | 0.082793 | 0.034021 | 0.654333 | -0.029145 | -0.029670 | 0.011091 | 0.063238 | -0.061811 | 0.088374 | 0.206472 |
| Online boarding | 0.199120 | 0.209287 | 0.453095 | 0.076496 | 0.401942 | 0.002373 | 0.227616 | 1.000000 | 0.421061 | 0.283294 | ... | 0.203660 | 0.080922 | 0.328658 | -0.019264 | -0.018953 | -0.047702 | 0.182496 | -0.219595 | 0.320012 | 0.501488 |
| Seat comfort | 0.165753 | 0.162458 | 0.120003 | 0.012538 | 0.027057 | 0.002113 | 0.571047 | 0.421061 | 1.000000 | 0.611922 | ... | 0.188210 | 0.075164 | 0.678665 | -0.020214 | -0.019982 | -0.025361 | 0.160301 | -0.117333 | 0.222306 | 0.346646 |
| Inflight entertainment | 0.082095 | 0.127695 | 0.209892 | -0.008201 | 0.040868 | 0.001807 | 0.620251 | 0.283294 | 0.611922 | 1.000000 | ... | 0.117732 | 0.409977 | 0.692659 | -0.029836 | -0.032594 | 0.010702 | 0.113825 | -0.142957 | 0.193357 | 0.396598 |
| On-board service | 0.058279 | 0.106771 | 0.126215 | 0.066756 | 0.044895 | -0.020336 | 0.047940 | 0.159393 | 0.127334 | 0.415289 | ... | 0.241526 | 0.552500 | 0.116082 | -0.033152 | -0.038396 | 0.012564 | 0.050134 | -0.059781 | 0.216732 | 0.320444 |
| Leg room service | 0.036321 | 0.134119 | 0.164493 | 0.011050 | 0.111251 | -0.001415 | 0.028795 | 0.128322 | 0.101217 | 0.299136 | ... | 0.149636 | 0.365369 | 0.094268 | 0.018398 | 0.016444 | 0.036565 | 0.040909 | -0.146079 | 0.209177 | 0.316000 |
| Baggage handling | -0.047479 | 0.063869 | 0.123945 | 0.071179 | 0.043150 | 0.010316 | 0.030490 | 0.089077 | 0.077582 | 0.380983 | ... | 0.227521 | 0.627786 | 0.095130 | -0.000621 | -0.004204 | 0.040792 | -0.029370 | -0.032856 | 0.159693 | 0.245168 |
| Checkin service | 0.030686 | 0.069235 | 0.039147 | 0.091393 | 0.007783 | -0.037831 | 0.082793 | 0.203660 | 0.188210 | 0.117732 | ... | 1.000000 | 0.230424 | 0.170122 | -0.010827 | -0.011282 | 0.012202 | 0.026635 | 0.014505 | 0.148964 | 0.230787 |
| Inflight service | -0.046155 | 0.059287 | 0.122411 | 0.079052 | 0.041945 | 0.010574 | 0.034021 | 0.080922 | 0.075164 | 0.409977 | ... | 0.230424 | 1.000000 | 0.092114 | -0.052596 | -0.057721 | 0.042363 | -0.023618 | -0.023941 | 0.155858 | 0.244252 |
| Cleanliness | 0.055864 | 0.092476 | 0.126899 | 0.012031 | 0.011411 | -0.006406 | 0.654333 | 0.328658 | 0.678665 | 0.692659 | ... | 0.170122 | 0.092114 | 1.000000 | -0.012302 | -0.011373 | 0.007837 | 0.086413 | -0.074011 | 0.134180 | 0.305447 |
| Departure Delay in Minutes | -0.010463 | -0.001503 | -0.019791 | 0.003782 | -0.006769 | 0.010107 | -0.029145 | -0.019264 | -0.020214 | -0.029836 | ... | -0.010827 | -0.052596 | -0.012302 | 1.000000 | 0.957879 | 0.002560 | -0.008902 | -0.004886 | -0.012730 | -0.048815 |
| Arrival Delay in Minutes | -0.013109 | -0.005201 | -0.020471 | 0.001855 | -0.008291 | 0.008999 | -0.029670 | -0.018953 | -0.019982 | -0.032594 | ... | -0.011282 | -0.057721 | -0.011373 | 0.957879 | 1.000000 | 0.000762 | -0.008782 | -0.005791 | -0.015227 | -0.053898 |
| GenderNumeric | 0.000808 | -0.000801 | 0.005182 | 0.005371 | -0.004421 | -0.008520 | 0.011091 | -0.047702 | -0.025361 | 0.010702 | ... | 0.012202 | 0.042363 | 0.007837 | 0.002560 | 0.000762 | 1.000000 | 0.023814 | 0.004935 | 0.004723 | 0.013601 |
| CustomerTypeNumeric | 0.285067 | 0.222738 | 0.001839 | 0.204754 | 0.014664 | -0.005085 | 0.063238 | 0.182496 | 0.160301 | 0.113825 | ... | 0.026635 | -0.023618 | 0.086413 | -0.008902 | -0.008782 | 0.023814 | 1.000000 | 0.308275 | 0.106015 | 0.181662 |
| TypeofTravelNumeric | -0.048445 | -0.262669 | -0.103161 | 0.261917 | -0.130555 | -0.024355 | -0.061811 | -0.219595 | -0.117333 | -0.142957 | ... | 0.014505 | -0.023941 | -0.074011 | -0.004886 | -0.005791 | 0.004935 | 0.308275 | 1.000000 | -0.541338 | -0.454110 |
| ClassNumeric | 0.139015 | 0.445825 | 0.036542 | -0.095647 | 0.100865 | -0.004630 | 0.088374 | 0.320012 | 0.222306 | 0.193357 | ... | 0.148964 | 0.155858 | 0.134180 | -0.012730 | -0.015227 | 0.004723 | 0.106015 | -0.541338 | 1.000000 | 0.498255 |
| SatisfactionNumeric | 0.131959 | 0.294396 | 0.282816 | -0.057916 | 0.166857 | -0.003030 | 0.206472 | 0.501488 | 0.346646 | 0.396598 | ... | 0.230787 | 0.244252 | 0.305447 | -0.048815 | -0.053898 | 0.013601 | 0.181662 | -0.454110 | 0.498255 | 1.000000 |
23 rows × 23 columns
# Annotated correlation heatmap of all numeric features.
fig, ax = plt.subplots(figsize=[18, 13])
correlations = dfclean.corr()
sns.heatmap(correlations, annot=True, fmt=".2f", ax=ax, cmap="bwr")
ax.set_title("Correlation Matrix", fontsize=20)
plt.show()
Very strong correlations (values from 0.8 to 1 or -0.8 to -1.0) Strong correlations (values from 0.6 to 0.8 or -0.6 to -0.8) Moderate correlations (values from 0.4 to 0.6 or -0.4 to -0.6)
## Distribution of each feature.
# seaborn's displot opens a brand-new figure per column; with 28 columns
# that trips matplotlib's `figure.max_open_warning` and retains every
# figure in memory. Render then close each one as we go.
for column in dfclean:
    sns.displot(x=column, data=dfclean)
    plt.show()
    plt.close('all')
C:\Users\thud1\anaconda3\envs\ML7331\lib\site-packages\seaborn\axisgrid.py:409: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
# DataFrame.info() writes its summary to stdout and returns None, so the
# original `print(dfclean.info())` appended a spurious "None" line to the
# output; call it directly.
dfclean.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 30000 entries, 42798 to 27683 Data columns (total 28 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Gender 30000 non-null object 1 Customer Type 30000 non-null object 2 Age 30000 non-null int64 3 Type of Travel 30000 non-null object 4 Class 30000 non-null object 5 Flight Distance 30000 non-null int64 6 Inflight wifi service 30000 non-null int64 7 Departure/Arrival time convenient 30000 non-null int64 8 Ease of Online booking 30000 non-null int64 9 Gate location 30000 non-null int64 10 Food and drink 30000 non-null int64 11 Online boarding 30000 non-null int64 12 Seat comfort 30000 non-null int64 13 Inflight entertainment 30000 non-null int64 14 On-board service 30000 non-null int64 15 Leg room service 30000 non-null int64 16 Baggage handling 30000 non-null int64 17 Checkin service 30000 non-null int64 18 Inflight service 30000 non-null int64 19 Cleanliness 30000 non-null int64 20 Departure Delay in Minutes 30000 non-null int64 21 Arrival Delay in Minutes 30000 non-null float64 22 satisfaction 30000 non-null object 23 GenderNumeric 30000 non-null int32 24 CustomerTypeNumeric 30000 non-null int32 25 TypeofTravelNumeric 30000 non-null int32 26 ClassNumeric 30000 non-null int64 27 SatisfactionNumeric 30000 non-null int64 dtypes: float64(1), int32(3), int64(19), object(5) memory usage: 7.3+ MB None
A total of 100,000+ passenger results are recorded in the full data set (of which 30,000 are sampled here). We have a combination of categorical, ordinal and continuous variables in this dataset.
#Sum Survey Score Columns - index(6:19)
# NOTE(review): iloc[:, 6:19] is a half-open slice, so it sums columns
# 6..18 ('Inflight wifi service' through 'Inflight service') and EXCLUDES
# column 19 ('Cleanliness'). Confirm whether 6:20 (all 14 survey items)
# was intended — the observed Survey_Total max of 64 is consistent with
# only 13 items being summed.
dfclean['Survey_Total']=dfclean.iloc[:,6:19].sum(axis=1)
#Add Age Groups (right-inclusive bins: 0-17, 18-35, 36-65, 66-90)
dfclean['Age_Group'] = pd.cut(dfclean['Age'], bins = [0,17,35,65,90], labels=["Adolescent", "Young Adult", "Middle Age", "Senior"], right=True)
# Numeric twin of Age_Group; note the labels are the STRINGS "1".."4"
# (categorical dtype), not integers.
dfclean['Age_Group_Numeric'] = pd.cut(dfclean['Age'], bins = [0,17,35,65,90], labels=["1", "2", "3", "4"], right=True)
#Add Distance Groups (0-500, 501-1000, 1001-2500, 2501-5000 miles)
dfclean['Distance_Group'] = pd.cut(dfclean['Flight Distance'], bins = [0,500,1000,2500,5000], labels=["Short", "Moderate", "Long", "Extra Long"], right=True)
dfclean['Distance_Group_Numeric'] = pd.cut(dfclean['Flight Distance'], bins = [0,500,1000,2500,5000], labels=["1", "2", "3", "4"], right=True)
#Add Departure Delay Groups; the -1 lower edge keeps 0-minute delays in
#the "None" bin (bin is (-1, 1], i.e. delays of 0 or 1 minute).
dfclean['Departure_Delay_Group'] = pd.cut(dfclean['Departure Delay in Minutes'], bins = [-1,1,100,1600], labels=["None","Minimal","Long"], right=True)
dfclean['Departure_Delay_Group_Numeric'] = pd.cut(dfclean['Departure Delay in Minutes'], bins = [-1,1,100,1600], labels=["0","1","2"], right=True)
dfclean.head(n=50)
| Gender | Customer Type | Age | Type of Travel | Class | Flight Distance | Inflight wifi service | Departure/Arrival time convenient | Ease of Online booking | Gate location | ... | TypeofTravelNumeric | ClassNumeric | SatisfactionNumeric | Survey_Total | Age_Group | Age_Group_Numeric | Distance_Group | Distance_Group_Numeric | Departure_Delay_Group | Departure_Delay_Group_Numeric | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 42798 | Male | Loyal Customer | 8 | Business travel | Eco Plus | 2586 | 2 | 5 | 5 | 5 | ... | 0 | 1 | 0 | 36 | Adolescent | 1 | Extra Long | 4 | Minimal | 1 |
| 99439 | Male | Loyal Customer | 60 | Business travel | Business | 3262 | 1 | 1 | 1 | 1 | ... | 0 | 2 | 1 | 46 | Middle Age | 3 | Extra Long | 4 | Minimal | 1 |
| 27324 | Male | Loyal Customer | 35 | Personal Travel | Eco | 259 | 3 | 1 | 3 | 4 | ... | 1 | 0 | 0 | 37 | Young Adult | 2 | Short | 1 | None | 0 |
| 62959 | Male | Loyal Customer | 53 | Business travel | Business | 794 | 2 | 2 | 2 | 2 | ... | 0 | 2 | 1 | 49 | Middle Age | 3 | Moderate | 2 | Minimal | 1 |
| 60767 | Female | disloyal Customer | 26 | Business travel | Eco | 784 | 3 | 3 | 3 | 5 | ... | 0 | 0 | 0 | 42 | Young Adult | 2 | Moderate | 2 | None | 0 |
| 806 | Female | Loyal Customer | 26 | Personal Travel | Eco | 908 | 3 | 4 | 3 | 4 | ... | 1 | 0 | 0 | 35 | Young Adult | 2 | Moderate | 2 | None | 0 |
| 30692 | Male | Loyal Customer | 50 | Personal Travel | Eco | 405 | 3 | 5 | 3 | 1 | ... | 1 | 0 | 0 | 51 | Middle Age | 3 | Short | 1 | Minimal | 1 |
| 44695 | Female | Loyal Customer | 47 | Business travel | Business | 3506 | 4 | 4 | 4 | 4 | ... | 0 | 2 | 1 | 44 | Middle Age | 3 | Extra Long | 4 | Minimal | 1 |
| 51878 | Male | Loyal Customer | 38 | Business travel | Eco | 231 | 3 | 2 | 2 | 2 | ... | 0 | 0 | 0 | 39 | Middle Age | 3 | Short | 1 | None | 0 |
| 16194 | Female | Loyal Customer | 7 | Personal Travel | Eco | 209 | 5 | 3 | 5 | 4 | ... | 1 | 0 | 1 | 49 | Adolescent | 1 | Short | 1 | None | 0 |
| 83004 | Male | Loyal Customer | 26 | Business travel | Eco Plus | 533 | 3 | 2 | 2 | 2 | ... | 0 | 1 | 0 | 36 | Young Adult | 2 | Moderate | 2 | None | 0 |
| 96341 | Male | Loyal Customer | 53 | Business travel | Business | 2069 | 3 | 3 | 3 | 3 | ... | 0 | 2 | 1 | 52 | Middle Age | 3 | Long | 3 | None | 0 |
| 41658 | Female | Loyal Customer | 62 | Business travel | Business | 3272 | 4 | 4 | 4 | 4 | ... | 0 | 2 | 1 | 48 | Middle Age | 3 | Extra Long | 4 | Minimal | 1 |
| 33411 | Male | Loyal Customer | 57 | Personal Travel | Eco | 337 | 2 | 4 | 2 | 5 | ... | 1 | 0 | 0 | 39 | Middle Age | 3 | Short | 1 | None | 0 |
| 38872 | Male | Loyal Customer | 54 | Business travel | Business | 3667 | 4 | 1 | 1 | 1 | ... | 0 | 2 | 0 | 42 | Middle Age | 3 | Extra Long | 4 | None | 0 |
| 35012 | Female | disloyal Customer | 65 | Business travel | Eco Plus | 373 | 1 | 1 | 1 | 4 | ... | 0 | 1 | 0 | 23 | Middle Age | 3 | Short | 1 | Minimal | 1 |
| 63879 | Male | Loyal Customer | 30 | Business travel | Business | 423 | 4 | 5 | 5 | 5 | ... | 0 | 2 | 0 | 50 | Young Adult | 2 | Short | 1 | Minimal | 1 |
| 93161 | Male | disloyal Customer | 30 | Business travel | Eco | 377 | 2 | 2 | 2 | 4 | ... | 0 | 0 | 0 | 44 | Young Adult | 2 | Short | 1 | Minimal | 1 |
| 102800 | Male | Loyal Customer | 33 | Business travel | Business | 525 | 5 | 5 | 5 | 5 | ... | 0 | 2 | 1 | 52 | Young Adult | 2 | Moderate | 2 | None | 0 |
| 37013 | Male | disloyal Customer | 22 | Business travel | Eco | 1846 | 2 | 2 | 2 | 4 | ... | 0 | 0 | 0 | 33 | Young Adult | 2 | Long | 3 | Minimal | 1 |
| 22720 | Male | disloyal Customer | 21 | Business travel | Eco | 230 | 5 | 3 | 5 | 5 | ... | 0 | 0 | 1 | 40 | Young Adult | 2 | Short | 1 | Minimal | 1 |
| 35870 | Male | Loyal Customer | 13 | Business travel | Business | 954 | 3 | 3 | 3 | 3 | ... | 0 | 2 | 1 | 45 | Adolescent | 1 | Moderate | 2 | None | 0 |
| 16044 | Female | Loyal Customer | 26 | Business travel | Eco Plus | 575 | 3 | 3 | 2 | 3 | ... | 0 | 1 | 0 | 39 | Young Adult | 2 | Moderate | 2 | None | 0 |
| 95924 | Male | Loyal Customer | 33 | Business travel | Business | 598 | 3 | 3 | 3 | 3 | ... | 0 | 2 | 1 | 47 | Young Adult | 2 | Moderate | 2 | None | 0 |
| 33563 | Female | Loyal Customer | 59 | Personal Travel | Eco | 944 | 4 | 2 | 3 | 3 | ... | 1 | 0 | 1 | 38 | Middle Age | 3 | Moderate | 2 | Minimal | 1 |
| 24683 | Male | Loyal Customer | 19 | Personal Travel | Eco | 304 | 4 | 4 | 4 | 3 | ... | 1 | 0 | 0 | 43 | Young Adult | 2 | Short | 1 | Minimal | 1 |
| 49016 | Female | disloyal Customer | 37 | Business travel | Eco | 720 | 3 | 3 | 3 | 1 | ... | 0 | 0 | 0 | 32 | Middle Age | 3 | Moderate | 2 | None | 0 |
| 84076 | Male | disloyal Customer | 18 | Business travel | Eco | 1365 | 1 | 1 | 1 | 4 | ... | 0 | 0 | 0 | 27 | Young Adult | 2 | Long | 3 | None | 0 |
| 5971 | Male | Loyal Customer | 59 | Business travel | Eco | 221 | 4 | 3 | 3 | 3 | ... | 0 | 0 | 1 | 37 | Middle Age | 3 | Short | 1 | None | 0 |
| 102653 | Male | disloyal Customer | 26 | Business travel | Business | 1744 | 4 | 4 | 4 | 2 | ... | 0 | 2 | 0 | 40 | Young Adult | 2 | Long | 3 | None | 0 |
| 31170 | Female | Loyal Customer | 47 | Personal Travel | Eco Plus | 453 | 3 | 4 | 3 | 3 | ... | 1 | 1 | 0 | 45 | Middle Age | 3 | Short | 1 | None | 0 |
| 12672 | Male | disloyal Customer | 17 | Business travel | Eco | 370 | 4 | 5 | 4 | 3 | ... | 0 | 0 | 1 | 51 | Adolescent | 1 | Short | 1 | None | 0 |
| 51107 | Female | Loyal Customer | 60 | Business travel | Business | 1145 | 5 | 5 | 5 | 5 | ... | 0 | 2 | 1 | 64 | Middle Age | 3 | Long | 3 | Minimal | 1 |
| 74968 | Female | Loyal Customer | 62 | Personal Travel | Eco | 1919 | 5 | 4 | 5 | 4 | ... | 1 | 0 | 1 | 45 | Middle Age | 3 | Long | 3 | None | 0 |
| 46611 | Male | disloyal Customer | 20 | Business travel | Eco | 1056 | 3 | 3 | 3 | 3 | ... | 0 | 0 | 0 | 44 | Young Adult | 2 | Long | 3 | Minimal | 1 |
| 66407 | Female | Loyal Customer | 60 | Business travel | Business | 888 | 3 | 3 | 3 | 3 | ... | 0 | 2 | 1 | 48 | Middle Age | 3 | Moderate | 2 | None | 0 |
| 83718 | Male | Loyal Customer | 60 | Business travel | Eco Plus | 1076 | 5 | 3 | 2 | 2 | ... | 0 | 1 | 1 | 48 | Middle Age | 3 | Long | 3 | None | 0 |
| 35556 | Female | Loyal Customer | 19 | Personal Travel | Business | 547 | 3 | 2 | 3 | 3 | ... | 1 | 2 | 0 | 41 | Young Adult | 2 | Moderate | 2 | Minimal | 1 |
| 53311 | Female | disloyal Customer | 20 | Business travel | Eco | 284 | 1 | 3 | 1 | 3 | ... | 0 | 0 | 0 | 33 | Young Adult | 2 | Short | 1 | Long | 2 |
| 11252 | Female | Loyal Customer | 69 | Personal Travel | Eco | 341 | 2 | 5 | 2 | 3 | ... | 1 | 0 | 0 | 40 | Senior | 4 | Short | 1 | None | 0 |
| 21211 | Female | disloyal Customer | 30 | Business travel | Business | 912 | 0 | 0 | 0 | 5 | ... | 0 | 2 | 1 | 31 | Young Adult | 2 | Moderate | 2 | None | 0 |
| 51368 | Male | Loyal Customer | 52 | Business travel | Business | 3112 | 1 | 1 | 1 | 1 | ... | 0 | 2 | 1 | 48 | Middle Age | 3 | Extra Long | 4 | Minimal | 1 |
| 99523 | Female | disloyal Customer | 22 | Business travel | Eco | 361 | 4 | 4 | 4 | 3 | ... | 0 | 0 | 0 | 49 | Young Adult | 2 | Short | 1 | None | 0 |
| 83792 | Female | Loyal Customer | 54 | Business travel | Business | 2386 | 3 | 5 | 5 | 5 | ... | 0 | 2 | 0 | 42 | Middle Age | 3 | Long | 3 | Minimal | 1 |
| 37433 | Female | Loyal Customer | 46 | Business travel | Business | 1620 | 4 | 4 | 4 | 4 | ... | 0 | 2 | 1 | 54 | Middle Age | 3 | Long | 3 | None | 0 |
| 29894 | Male | Loyal Customer | 47 | Business travel | Eco | 429 | 5 | 2 | 2 | 2 | ... | 0 | 0 | 1 | 47 | Middle Age | 3 | Short | 1 | None | 0 |
| 34124 | Female | disloyal Customer | 42 | Business travel | Eco | 776 | 3 | 3 | 3 | 3 | ... | 0 | 0 | 0 | 34 | Middle Age | 3 | Moderate | 2 | None | 0 |
| 46624 | Female | Loyal Customer | 64 | Business travel | Business | 1709 | 0 | 0 | 0 | 2 | ... | 0 | 2 | 1 | 38 | Middle Age | 3 | Long | 3 | Minimal | 1 |
| 3544 | Male | Loyal Customer | 55 | Business travel | Business | 1787 | 2 | 2 | 2 | 2 | ... | 0 | 2 | 1 | 49 | Middle Age | 3 | Long | 3 | Minimal | 1 |
| 66706 | Female | Loyal Customer | 68 | Personal Travel | Eco | 746 | 3 | 4 | 2 | 3 | ... | 1 | 0 | 0 | 50 | Senior | 4 | Moderate | 2 | None | 0 |
50 rows × 35 columns
# Plotly histogram: Age
px.histogram(dfclean['Age'], x="Age", title='Age Distribution')

# Plotly histogram: Flight Distance
px.histogram(dfclean['Flight Distance'], x="Flight Distance",
             title='Flight Distance Distribution')

# Plotly histogram: Departure Delay in Minutes (50 bins for the long right tail)
px.histogram(dfclean['Departure Delay in Minutes'],
             x="Departure Delay in Minutes",
             title='Departure Delay Distribution',
             nbins=50)

# Plotly histogram: Arrival Delay in Minutes
px.histogram(dfclean['Arrival Delay in Minutes'],
             x="Arrival Delay in Minutes",
             title='Arrival Delay Distribution',
             nbins=50)

# Box plot: age distribution within each cabin class
px.box(dfclean, x="Class", y="Age", color="Class",
       title="Box Plot of Age by Class")
# One-way ANOVA: does mean Age differ across cabin classes?
# Fit an Ordinary Least Squares (OLS) model, then extract the ANOVA table.
age_by_class = ols('Age ~ C(Class)', data=dfclean).fit()
anova_table = sm.stats.anova_lm(age_by_class, typ=2)
anova_table
| sum_sq | df | F | PR(>F) | |
|---|---|---|---|---|
| C(Class) | 1.340587e+05 | 2.0 | 298.275814 | 5.391536e-129 |
| Residual | 6.741005e+06 | 29997.0 | NaN | NaN |
The ANOVA between age and class shows overwhelming evidence that there is a difference between the mean age of travelers among the various classes.
# Box plot: total survey score within each cabin class
px.box(dfclean, x="Class", y="Survey_Total", color="Class",
       title="Box Plot of Survey Total by Class")

# One-way ANOVA: does mean Survey_Total differ across cabin classes?
# Fit an Ordinary Least Squares (OLS) model, then extract the ANOVA table.
survey_by_class = ols('Survey_Total ~ C(Class)', data=dfclean).fit()
anova_table = sm.stats.anova_lm(survey_by_class, typ=2)
anova_table
| sum_sq | df | F | PR(>F) | |
|---|---|---|---|---|
| C(Class) | 1.661443e+05 | 2.0 | 1224.53601 | 0.0 |
| Residual | 2.034987e+06 | 29997.0 | NaN | NaN |
The ANOVA between class and survey score shows overwhelming evidence that there is a difference between the mean survey score among the classes.
# NOTE(review): title reads "Distance Traveled by Survey Total" but the
# axes are x=Distance_Group, y=Survey_Total — likely meant the reverse.
px.box(dfclean,
x="Distance_Group", y="Survey_Total",
color="Distance_Group",
title = "Box Plot of Distance Traveled by Survey Total")
#ANOVA of distance group to Total Survey Score (comment corrected: the
#model below groups by Distance_Group, not by age)
# Ordinary Least Squares (OLS) model
model = ols('Survey_Total ~ C(Distance_Group)', data=dfclean).fit()
anova_table = sm.stats.anova_lm(model, typ=2)
anova_table
| sum_sq | df | F | PR(>F) | |
|---|---|---|---|---|
| C(Distance_Group) | 4.747145e+04 | 3.0 | 220.392802 | 1.969810e-141 |
| Residual | 2.153660e+06 | 29996.0 | NaN | NaN |
The ANOVA between distance group and survey score shows overwhelming evidence that there is a difference between the mean survey score among the distance groups.
#ANOVA of age to Total Survey Score (comment corrected: the model below
#groups Survey_Total by C(Age), not by class)
# Ordinary Least Squares (OLS) model
model = ols('Survey_Total ~ C(Age)', data=dfclean).fit()
anova_table = sm.stats.anova_lm(model, typ=2)
anova_table
| sum_sq | df | F | PR(>F) | |
|---|---|---|---|---|
| C(Age) | 7.204341e+04 | 74.0 | 13.683684 | 1.190214e-161 |
| Residual | 2.129088e+06 | 29925.0 | NaN | NaN |
# Box plot: total survey score by departure delay group
px.box(dfclean,
x="Departure_Delay_Group", y="Survey_Total",
color="Departure_Delay_Group",
title = "Box Plot of Total by Departure Delay Group")
#ANOVA of departure delay group to Total Survey Score (comment corrected:
#the model below groups by Departure_Delay_Group, not by age)
# Ordinary Least Squares (OLS) model
model = ols('Survey_Total ~ C(Departure_Delay_Group)', data=dfclean).fit()
anova_table = sm.stats.anova_lm(model, typ=2)
anova_table
| sum_sq | df | F | PR(>F) | |
|---|---|---|---|---|
| C(Departure_Delay_Group) | 1.489027e+03 | 2.0 | 10.153092 | 0.000039 |
| Residual | 2.199643e+06 | 29997.0 | NaN | NaN |
The ANOVA between departure delay group and survey score shows overwhelming evidence that there is a difference between the mean survey score among the departure delay groups.
# Two-sample t-test setup: split total survey score by gender.
male = dfclean.loc[dfclean['Gender'] == "Male", 'Survey_Total']
female = dfclean.loc[dfclean['Gender'] == "Female", 'Survey_Total']
# Per-gender summary statistics for every numeric column.
dfclean.groupby('Gender').describe()
| Age | Flight Distance | ... | SatisfactionNumeric | Survey_Total | |||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | mean | std | min | 25% | 50% | 75% | max | count | mean | ... | 75% | max | count | mean | std | min | 25% | 50% | 75% | max | |
| Gender | |||||||||||||||||||||
| Female | 15243.0 | 39.287804 | 15.115859 | 7.0 | 27.0 | 40.0 | 51.0 | 85.0 | 15243.0 | 1184.697960 | ... | 1.0 | 1.0 | 15243.0 | 42.022174 | 8.632066 | 14.0 | 36.0 | 42.0 | 48.0 | 65.0 |
| Male | 14757.0 | 39.312259 | 15.162504 | 7.0 | 27.0 | 40.0 | 51.0 | 85.0 | 14757.0 | 1183.106322 | ... | 1.0 | 1.0 | 14757.0 | 42.235956 | 8.495803 | 14.0 | 36.0 | 42.0 | 48.0 | 65.0 |
2 rows × 192 columns
# Check the normality assumption visually: Survey_Total distribution,
# faceted by gender.
px.histogram(dfclean,
             x="Survey_Total",
             title='Survey Total Distribution',
             facet_col="Gender")
#Check Variance Assumptions
# Levene's test for equal variances between the two groups; p < 0.05
# rejects the equal-variance assumption of the pooled t-test.
stats.levene(male, female)
LeveneResult(statistic=3.970966701611184, pvalue=0.046300234377207254)
#T-Test
# Levene's test above returned p ~= 0.046 (< 0.05), rejecting equal
# variances, so the pooled test's equal_var=True was inconsistent with our
# own assumption check. Use Welch's t-test (equal_var=False); with nearly
# equal group sizes the conclusion is unchanged.
res = stats.ttest_ind(male, female, equal_var=False)
display(res)
Ttest_indResult(statistic=0.3458585763053265, pvalue=0.7294514263123113)
We failed to reject the null hypothesis at alpha 0.05, as such, based on this analysis we do not see a difference in means for survey scores between gender.
# Prepare the feature matrix for cluster analysis.
cluster_df = dfclean.copy()

# These columns are treated as continuous measurements...
continuous_features = ['Flight Distance', 'Departure Delay in Minutes',
                       'Arrival Delay in Minutes']
# ...and these ordinal ratings as integers.
ordinal_features = ['Age', 'Inflight wifi service',
                    'Departure/Arrival time convenient']
# We won't touch these variables; they stay as categorical 0/1 encodings.
categ_features = ['GenderNumeric', 'TypeofTravelNumeric',
                  'CustomerTypeNumeric', 'SatisfactionNumeric']

# Use "astype" to coerce the variable types.
cluster_df[continuous_features] = cluster_df[continuous_features].astype(np.float64)
cluster_df[ordinal_features] = cluster_df[ordinal_features].astype(np.int64)

# Drop the raw string categoricals and engineered group labels, keeping
# only the numeric encodings for clustering.
columns_to_drop = ['Age_Group', 'Distance_Group', 'Distance_Group_Numeric',
                   'Gender', 'Customer Type', 'Type of Travel', 'Class',
                   'Departure_Delay_Group', 'Departure_Delay_Group_Numeric',
                   'Age_Group_Numeric', 'satisfaction']
cluster_df = cluster_df.drop(columns_to_drop, axis=1)
cluster_df.info()  # now our data looks better!!
<class 'pandas.core.frame.DataFrame'> Int64Index: 30000 entries, 16096 to 86438 Data columns (total 24 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Age 30000 non-null int64 1 Flight Distance 30000 non-null float64 2 Inflight wifi service 30000 non-null int64 3 Departure/Arrival time convenient 30000 non-null int64 4 Ease of Online booking 30000 non-null int64 5 Gate location 30000 non-null int64 6 Food and drink 30000 non-null int64 7 Online boarding 30000 non-null int64 8 Seat comfort 30000 non-null int64 9 Inflight entertainment 30000 non-null int64 10 On-board service 30000 non-null int64 11 Leg room service 30000 non-null int64 12 Baggage handling 30000 non-null int64 13 Checkin service 30000 non-null int64 14 Inflight service 30000 non-null int64 15 Cleanliness 30000 non-null int64 16 Departure Delay in Minutes 30000 non-null float64 17 Arrival Delay in Minutes 30000 non-null float64 18 GenderNumeric 30000 non-null int32 19 CustomerTypeNumeric 30000 non-null int32 20 TypeofTravelNumeric 30000 non-null int32 21 ClassNumeric 30000 non-null int64 22 SatisfactionNumeric 30000 non-null int64 23 Survey_Total 30000 non-null int64 dtypes: float64(3), int32(3), int64(18) memory usage: 6.4 MB
# Scale features to zero mean / unit variance so no single column dominates
# the Euclidean distances KMeans relies on.
# FIX: the original fit the scaler on the int-typed frame and then transformed
# a float copy — fit_transform on the float frame does it in one consistent pass.
cluster_df_scaled = StandardScaler().fit_transform(cluster_df.astype(float))

# Elbow method: fit KMeans for k = 1..19 and record the within-cluster
# sum of squares (inertia) at each k.
wcss = []
for k in range(1, 20):
    model = KMeans(n_clusters=k, init="k-means++", random_state=50)
    model.fit(cluster_df_scaled)
    wcss.append(model.inertia_)

plt.figure(figsize=(10, 10))
plt.plot(range(1, 20), wcss)
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()
Based on the WCSS curve, we see that it starts to flatten around 7–8 clusters. To ensure that we do not overfit our clustering models, we will use 8 clusters for our PCA components. Any more may hurt the models' performance.
Utilizing PCA, we selected 2 PCA components to allow for easier interpretation and visualization of the clusters.
# Reduce the scaled feature space to two principal components so the
# clusters can be visualized in 2-D; `data` holds the projected coordinates.
pca = PCA(2)
data = pca.fit_transform(cluster_df_scaled)
data
array([[ 1.84267024, -2.00373957],
[ 2.20544825, -0.90255478],
[ 0.088318 , -1.11898053],
...,
[ 4.19832338, 0.55916796],
[-0.19883972, 2.05908113],
[ 2.00110632, 0.19032105]])
# KMeans on the 2-D PCA projection.
# FIX: set random_state for reproducibility, matching random_state=50 used in
# the elbow search — otherwise labels change on every rerun.
model = KMeans(n_clusters=8, init="k-means++", random_state=50)
label = model.fit_predict(data)

plt.figure(figsize=(10, 10))
for i in np.unique(label):
    plt.scatter(data[label == i, 0], data[label == i, 1], label=i)
plt.legend()
plt.title('KMeans Clustering', fontsize=15)
plt.show()
# Silhouette computed on the full scaled feature space, not the 2-D projection
metrics.silhouette_score(cluster_df_scaled, label)
0.04467975673601045
We use the Silhouette score to measure how dense and well-separated the clusters are for our data set. A silhouette score of 1 means that the clusters are very dense and nicely separated. A score of 0 means that clusters are overlapping. A score of less than 0 means that points may be assigned to the wrong clusters. In this instance we have a Silhouette score of 0.0447. Since it is not negative and the visuals show distinct clustering, we determined that our PCA-scaled clustering is adequate.
# k-distance plot: for each point, compute the distance to its nearest
# neighbour, then plot the sorted distances to choose a DBSCAN eps.
neigh = NearestNeighbors(n_neighbors=2)
nbrs = neigh.fit(data)
distances, indices = nbrs.kneighbors(data)
# column 0 is the zero distance to the point itself; keep column 1
distances = np.sort(distances, axis=0)[:, 1]
plt.plot(distances)
[<matplotlib.lines.Line2D at 0x29ae75181c0>]
We determined that the eps for DBSCAN starts to show diminishing returns around 0.15. As such, for our DBSCAN clustering we will use 0.2 for this analysis.
# DBSCAN clustering on the PCA components
dbscan = DBSCAN(eps=0.2)
dbscan_label = dbscan.fit_predict(data)

plt.figure(figsize=(10, 10))
# BUG FIX: the original indexed `data` with the KMeans `label`, so the
# "DBSCAN" plot actually showed the KMeans clusters — use dbscan_label.
for i in np.unique(dbscan_label):
    plt.scatter(data[dbscan_label == i, 0], data[dbscan_label == i, 1], label=i)
plt.legend()
plt.title('DBSCAN Clustering', fontsize=15)
plt.show()
metrics.silhouette_score(cluster_df_scaled, dbscan_label)
-0.0652248129630493
In the DBSCAN instance we have a Silhouette score of -0.065. Although visually we see great separation, the Silhouette score is negative, which suggests that the cluster assignments may be incorrect. As such, we can discount DBSCAN from our final model.
# Ward-linkage dendrogram to eyeball a sensible number of clusters
linkage_matrix = sch.linkage(data, method='ward')
dendrogram = sch.dendrogram(linkage_matrix)
plt.title('Dendrogram')
plt.show()
# Hierarchical agglomerative clustering on the PCA components
hac = AgglomerativeClustering(n_clusters=5, affinity='euclidean', linkage='ward')
hac_label = hac.fit_predict(data)

plt.figure(figsize=(10, 10))
# BUG FIX: the original indexed `data` with the KMeans `label`, so the HAC
# plot was colored by the KMeans clusters — index with hac_label instead.
for i in np.unique(hac_label):
    plt.scatter(data[hac_label == i, 0], data[hac_label == i, 1], label=i)
plt.legend()
plt.title('HAC Clustering', fontsize=15)
plt.show()
metrics.silhouette_score(cluster_df_scaled, hac_label)
0.07435444471933086
In the HAC instance we have a Silhouette score of 0.0744. Since it is not negative and the visuals show distinct clustering, we determined that our PCA-scaled clustering is adequate.
# Attach each model's cluster assignment back onto the cleaned data set
result_df = dfclean.copy()
result_df = result_df.assign(
    kmeans_cluster=label,
    dbscan_cluster=dbscan_label,
    hac_cluster=hac_label,
)
result_df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 30000 entries, 16096 to 86438 Data columns (total 38 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Gender 30000 non-null object 1 Customer Type 30000 non-null object 2 Age 30000 non-null int64 3 Type of Travel 30000 non-null object 4 Class 30000 non-null object 5 Flight Distance 30000 non-null int64 6 Inflight wifi service 30000 non-null int64 7 Departure/Arrival time convenient 30000 non-null int64 8 Ease of Online booking 30000 non-null int64 9 Gate location 30000 non-null int64 10 Food and drink 30000 non-null int64 11 Online boarding 30000 non-null int64 12 Seat comfort 30000 non-null int64 13 Inflight entertainment 30000 non-null int64 14 On-board service 30000 non-null int64 15 Leg room service 30000 non-null int64 16 Baggage handling 30000 non-null int64 17 Checkin service 30000 non-null int64 18 Inflight service 30000 non-null int64 19 Cleanliness 30000 non-null int64 20 Departure Delay in Minutes 30000 non-null int64 21 Arrival Delay in Minutes 30000 non-null float64 22 satisfaction 30000 non-null object 23 GenderNumeric 30000 non-null int32 24 CustomerTypeNumeric 30000 non-null int32 25 TypeofTravelNumeric 30000 non-null int32 26 ClassNumeric 30000 non-null int64 27 SatisfactionNumeric 30000 non-null int64 28 Survey_Total 30000 non-null int64 29 Age_Group 30000 non-null category 30 Age_Group_Numeric 30000 non-null category 31 Distance_Group 30000 non-null category 32 Distance_Group_Numeric 30000 non-null category 33 Departure_Delay_Group 30000 non-null category 34 Departure_Delay_Group_Numeric 30000 non-null category 35 kmeans_cluster 30000 non-null int32 36 dbscan_cluster 30000 non-null int64 37 hac_cluster 30000 non-null int64 dtypes: category(6), float64(1), int32(4), int64(22), object(5) memory usage: 8.3+ MB
# clusters compared to Flight Distance
# (FIX: the original comment said "Gender", but these plots show Flight Distance)
fig, axes = plt.subplots(1, 3, figsize=(15, 5), sharey=True)
fig.suptitle('Clustering Methods and Flight Distance')
# one boxplot per clustering method, sharing the y-axis for comparison
for ax, col, title in zip(
    axes,
    ['kmeans_cluster', 'dbscan_cluster', 'hac_cluster'],
    ['KMeans', 'DBSCAN', 'HAC'],
):
    sns.boxplot(ax=ax, x=result_df[col].values, y=result_df['Flight Distance'].values)
    ax.set_title(title)
Text(0.5, 1.0, 'HAC')
We did not observe that flight distance was significantly different between the clusters.
# clusters compared to Survey Total
fig, axes = plt.subplots(1, 3, figsize=(15, 5), sharey=True)
# BUG FIX: the original suptitle said 'Gender', but these plots show Survey_Total
fig.suptitle('Clustering Methods and Survey Total')
# one boxplot per clustering method, sharing the y-axis for comparison
for ax, col, title in zip(
    axes,
    ['kmeans_cluster', 'dbscan_cluster', 'hac_cluster'],
    ['KMeans', 'DBSCAN', 'HAC'],
):
    sns.boxplot(ax=ax, x=result_df[col].values, y=result_df['Survey_Total'].values)
    ax.set_title(title)
Text(0.5, 1.0, 'HAC')
We have already discarded the DBSCAN model in our previous note. Based on the boxplots, the KMeans model showed the best separation (variance between clusters), and we determined that our KMeans model is a good candidate for determining customer satisfaction.
The HAC and KMeans models proved effective at creating clearly defined groups that can be extrapolated back to the original data to uncover otherwise unclear relationships to customer satisfaction. We determined that the KMeans model achieved our goal, given the clear separation of its clusters.
This type of model is useful because we are able to take a data driven approach to generate behavioral benchmarks to show how one responds to their satisfaction to the airline. We identified this in Kmeans model has the ability to identify actual dissatisfaction or potential anomalies.
Deployment of this method can be done by sharing this code with interested airline companies. We will be required to refit the model on each company's own records. In addition, we can advise airline companies to incorporate survey requirements to ensure that the model captures the inputs it needs.
Other additional data that we may think will impact the data set;
The cadence of these reports should coincide to quarterly financial reports as this can be utilized to assess and course correct any issues to the financial plan as low customer satisfaction impacts both top and bottom line of the P&L.
You have free reign to provide additional analyses or combine analyses
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.decomposition import PCA
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score
from sklearn.model_selection import train_test_split
from sklearn import metrics
cluster_df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 30000 entries, 16096 to 86438 Data columns (total 24 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Age 30000 non-null int64 1 Flight Distance 30000 non-null float64 2 Inflight wifi service 30000 non-null int64 3 Departure/Arrival time convenient 30000 non-null int64 4 Ease of Online booking 30000 non-null int64 5 Gate location 30000 non-null int64 6 Food and drink 30000 non-null int64 7 Online boarding 30000 non-null int64 8 Seat comfort 30000 non-null int64 9 Inflight entertainment 30000 non-null int64 10 On-board service 30000 non-null int64 11 Leg room service 30000 non-null int64 12 Baggage handling 30000 non-null int64 13 Checkin service 30000 non-null int64 14 Inflight service 30000 non-null int64 15 Cleanliness 30000 non-null int64 16 Departure Delay in Minutes 30000 non-null float64 17 Arrival Delay in Minutes 30000 non-null float64 18 GenderNumeric 30000 non-null int32 19 CustomerTypeNumeric 30000 non-null int32 20 TypeofTravelNumeric 30000 non-null int32 21 ClassNumeric 30000 non-null int64 22 SatisfactionNumeric 30000 non-null int64 23 Survey_Total 30000 non-null int64 dtypes: float64(3), int32(3), int64(18) memory usage: 6.4 MB
# Standardize each column to zero mean / unit variance, then L2-normalize
# every row; wrap the resulting array back into a DataFrame.
scaler = StandardScaler()
scaled_df = scaler.fit_transform(cluster_df)
normalized_df = pd.DataFrame(normalize(scaled_df))
# Sweep the number of Gaussian mixture components and record the BIC and
# AIC of each fit so we can compare model complexity vs. fit quality.
n_range = range(2, 20)
bic_score, aic_score = [], []
for n in n_range:
    gm = GaussianMixture(n_components=n, random_state=123, n_init=10)
    gm.fit(normalized_df)
    bic_score.append(gm.bic(normalized_df))
    aic_score.append(gm.aic(normalized_df))

fig, ax = plt.subplots(figsize=(12, 8), nrows=1)
ax.plot(n_range, bic_score, '-o', color='orange')
ax.plot(n_range, aic_score, '-o', color='green')
ax.set(xlabel='Number of Clusters', ylabel='Score')
ax.set_xticks(n_range)
ax.set_title('BIC and AIC Scores Per Number Of Clusters')
Text(0.5, 1.0, 'BIC and AIC Scores Per Number Of Clusters')
In this AIC/BIC plot we can look for the best prediction model as we increase the number of clusters. One of the more concerning aspects of this particular AIC/BIC graph is the number of clusters required to optimize the model. We see a decrease at 4–5 clusters; however, the scores continue to decrease beyond that. This makes it a challenge to make a definitive choice of cluster count.
In order to potentially help the gaussian mixture model to better help fit the dataset, utilizing PCA feature reduction will be necessary.
# Project the normalized data onto two principal components
pca = PCA(n_components=2)
X_principal = pd.DataFrame(pca.fit_transform(normalized_df), columns=['P1', 'P2'])
X_principal.head(2)
# Fit an 8-component Gaussian mixture on the 2-D projection
gmm = GaussianMixture(n_components=8)
gmm.fit(X_principal)
GaussianMixture(n_components=8)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
GaussianMixture(n_components=8)
# Visualizing the clustering
# BUG FIX: the original fit a brand-new, unseeded GaussianMixture inside the
# scatter call instead of reusing the `gmm` fitted above, so the plotted
# clusters did not correspond to the fitted model; predict with `gmm`.
plt.scatter(X_principal['P1'], X_principal['P2'],
            c=gmm.predict(X_principal), cmap=plt.cm.spring, alpha=0.6)
plt.show()
# Repeat the BIC/AIC sweep, this time on the 2-D PCA projection
n_range = range(2, 11)
bic_score, aic_score = [], []
for n in n_range:
    gm = GaussianMixture(n_components=n, random_state=123, n_init=10)
    gm.fit(X_principal)
    bic_score.append(gm.bic(X_principal))
    aic_score.append(gm.aic(X_principal))

fig, ax = plt.subplots(figsize=(12, 8), nrows=1)
ax.plot(n_range, bic_score, '-o', color='orange')
ax.plot(n_range, aic_score, '-o', color='green')
ax.set(xlabel='Number of Clusters', ylabel='Score')
ax.set_xticks(n_range)
ax.set_title('BIC and AIC Scores Per Number Of Clusters')
Text(0.5, 1.0, 'BIC and AIC Scores Per Number Of Clusters')
In the AIC/BIC cluster comparison graph we can see a large decrease in error at 7 clusters. This has been consistent with what we have seen in the other models. As such, having more than 7 clusters may result in over-fitting.
Keeping PCA for feature reduction consistent, to reduce the number of required clusters, improved the AIC/BIC error scores.
Conclusion: The Gaussian Mixture Model is another model that we would consider for this problem. Although we kept the use of PCA consistent throughout this analysis, we had to apply PCA in order to improve performance, which makes interpretation of the data difficult. This in turn makes determining which clustering to use challenging.